{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "ratings = pd.read_csv(\"datasets/ml-latest/ratings.csv\", usecols=['movieId', 'rating'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "genome_scores = pd.read_csv(\"datasets/ml-latest/genome-scores.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "genome_tags = pd.read_csv(\"datasets/ml-latest/genome-tags.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "movie_names = pd.read_csv(\"datasets/ml-latest/movies.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "links = pd.read_csv(\"datasets/ml-latest/links.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "movie_tags_in_text = pd.merge(genome_scores, genome_tags, on='tagId')[['movieId', 'tag', 'relevance']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Determine a good tag relevancy score cut-off"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>tagId</th>\n",
       "      <th>relevance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0.02400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0.02400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>0.05475</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>0.09200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>0.14825</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   movieId  tagId  relevance\n",
       "0        1      1    0.02400\n",
       "1        1      2    0.02400\n",
       "2        1      3    0.05475\n",
       "3        1      4    0.09200\n",
       "4        1      5    0.14825"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "genome_scores.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>tagId</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>10</th>\n",
       "      <th>...</th>\n",
       "      <th>1119</th>\n",
       "      <th>1120</th>\n",
       "      <th>1121</th>\n",
       "      <th>1122</th>\n",
       "      <th>1123</th>\n",
       "      <th>1124</th>\n",
       "      <th>1125</th>\n",
       "      <th>1126</th>\n",
       "      <th>1127</th>\n",
       "      <th>1128</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>movieId</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.02400</td>\n",
       "      <td>0.02400</td>\n",
       "      <td>0.05475</td>\n",
       "      <td>0.09200</td>\n",
       "      <td>0.14825</td>\n",
       "      <td>0.21500</td>\n",
       "      <td>0.06625</td>\n",
       "      <td>0.27025</td>\n",
       "      <td>0.26050</td>\n",
       "      <td>0.03025</td>\n",
       "      <td>...</td>\n",
       "      <td>0.03650</td>\n",
       "      <td>0.01800</td>\n",
       "      <td>0.04525</td>\n",
       "      <td>0.03275</td>\n",
       "      <td>0.12450</td>\n",
       "      <td>0.04175</td>\n",
       "      <td>0.02000</td>\n",
       "      <td>0.03475</td>\n",
       "      <td>0.08350</td>\n",
       "      <td>0.02525</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.03800</td>\n",
       "      <td>0.04175</td>\n",
       "      <td>0.03700</td>\n",
       "      <td>0.04875</td>\n",
       "      <td>0.11075</td>\n",
       "      <td>0.07325</td>\n",
       "      <td>0.04950</td>\n",
       "      <td>0.10775</td>\n",
       "      <td>0.10200</td>\n",
       "      <td>0.02050</td>\n",
       "      <td>...</td>\n",
       "      <td>0.03900</td>\n",
       "      <td>0.01925</td>\n",
       "      <td>0.01725</td>\n",
       "      <td>0.02425</td>\n",
       "      <td>0.13425</td>\n",
       "      <td>0.02225</td>\n",
       "      <td>0.01600</td>\n",
       "      <td>0.01450</td>\n",
       "      <td>0.09600</td>\n",
       "      <td>0.02025</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.04200</td>\n",
       "      <td>0.05250</td>\n",
       "      <td>0.02725</td>\n",
       "      <td>0.07975</td>\n",
       "      <td>0.05625</td>\n",
       "      <td>0.07025</td>\n",
       "      <td>0.05975</td>\n",
       "      <td>0.18275</td>\n",
       "      <td>0.05175</td>\n",
       "      <td>0.02725</td>\n",
       "      <td>...</td>\n",
       "      <td>0.03950</td>\n",
       "      <td>0.02625</td>\n",
       "      <td>0.02725</td>\n",
       "      <td>0.03450</td>\n",
       "      <td>0.16925</td>\n",
       "      <td>0.03525</td>\n",
       "      <td>0.01725</td>\n",
       "      <td>0.01875</td>\n",
       "      <td>0.09925</td>\n",
       "      <td>0.02000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.03600</td>\n",
       "      <td>0.03850</td>\n",
       "      <td>0.03500</td>\n",
       "      <td>0.03125</td>\n",
       "      <td>0.07100</td>\n",
       "      <td>0.04500</td>\n",
       "      <td>0.02475</td>\n",
       "      <td>0.08300</td>\n",
       "      <td>0.05150</td>\n",
       "      <td>0.02975</td>\n",
       "      <td>...</td>\n",
       "      <td>0.05375</td>\n",
       "      <td>0.03300</td>\n",
       "      <td>0.02275</td>\n",
       "      <td>0.04025</td>\n",
       "      <td>0.19600</td>\n",
       "      <td>0.05700</td>\n",
       "      <td>0.01550</td>\n",
       "      <td>0.01475</td>\n",
       "      <td>0.06625</td>\n",
       "      <td>0.01400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.04075</td>\n",
       "      <td>0.05125</td>\n",
       "      <td>0.05800</td>\n",
       "      <td>0.03675</td>\n",
       "      <td>0.07575</td>\n",
       "      <td>0.12675</td>\n",
       "      <td>0.02975</td>\n",
       "      <td>0.08175</td>\n",
       "      <td>0.03075</td>\n",
       "      <td>0.02950</td>\n",
       "      <td>...</td>\n",
       "      <td>0.04000</td>\n",
       "      <td>0.02850</td>\n",
       "      <td>0.02100</td>\n",
       "      <td>0.02650</td>\n",
       "      <td>0.15475</td>\n",
       "      <td>0.02050</td>\n",
       "      <td>0.01700</td>\n",
       "      <td>0.01575</td>\n",
       "      <td>0.11275</td>\n",
       "      <td>0.01975</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 1128 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "tagId       1        2        3        4        5        6        7     \\\n",
       "movieId                                                                  \n",
       "1        0.02400  0.02400  0.05475  0.09200  0.14825  0.21500  0.06625   \n",
       "2        0.03800  0.04175  0.03700  0.04875  0.11075  0.07325  0.04950   \n",
       "3        0.04200  0.05250  0.02725  0.07975  0.05625  0.07025  0.05975   \n",
       "4        0.03600  0.03850  0.03500  0.03125  0.07100  0.04500  0.02475   \n",
       "5        0.04075  0.05125  0.05800  0.03675  0.07575  0.12675  0.02975   \n",
       "\n",
       "tagId       8        9        10     ...        1119     1120     1121  \\\n",
       "movieId                              ...                                 \n",
       "1        0.27025  0.26050  0.03025   ...     0.03650  0.01800  0.04525   \n",
       "2        0.10775  0.10200  0.02050   ...     0.03900  0.01925  0.01725   \n",
       "3        0.18275  0.05175  0.02725   ...     0.03950  0.02625  0.02725   \n",
       "4        0.08300  0.05150  0.02975   ...     0.05375  0.03300  0.02275   \n",
       "5        0.08175  0.03075  0.02950   ...     0.04000  0.02850  0.02100   \n",
       "\n",
       "tagId       1122     1123     1124     1125     1126     1127     1128  \n",
       "movieId                                                                 \n",
       "1        0.03275  0.12450  0.04175  0.02000  0.03475  0.08350  0.02525  \n",
       "2        0.02425  0.13425  0.02225  0.01600  0.01450  0.09600  0.02025  \n",
       "3        0.03450  0.16925  0.03525  0.01725  0.01875  0.09925  0.02000  \n",
       "4        0.04025  0.19600  0.05700  0.01550  0.01475  0.06625  0.01400  \n",
       "5        0.02650  0.15475  0.02050  0.01700  0.01575  0.11275  0.01975  \n",
       "\n",
       "[5 rows x 1128 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "genome_scores[:100000].pivot(index='movieId', columns='tagId')['relevance'].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "table = genome_scores[:100000].pivot_table('relevance', index='movieId', columns='tagId', aggfunc='mean')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>tagId</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>10</th>\n",
       "      <th>...</th>\n",
       "      <th>1119</th>\n",
       "      <th>1120</th>\n",
       "      <th>1121</th>\n",
       "      <th>1122</th>\n",
       "      <th>1123</th>\n",
       "      <th>1124</th>\n",
       "      <th>1125</th>\n",
       "      <th>1126</th>\n",
       "      <th>1127</th>\n",
       "      <th>1128</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>movieId</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.02400</td>\n",
       "      <td>0.02400</td>\n",
       "      <td>0.05475</td>\n",
       "      <td>0.09200</td>\n",
       "      <td>0.14825</td>\n",
       "      <td>0.21500</td>\n",
       "      <td>0.06625</td>\n",
       "      <td>0.27025</td>\n",
       "      <td>0.26050</td>\n",
       "      <td>0.03025</td>\n",
       "      <td>...</td>\n",
       "      <td>0.03650</td>\n",
       "      <td>0.01800</td>\n",
       "      <td>0.04525</td>\n",
       "      <td>0.03275</td>\n",
       "      <td>0.12450</td>\n",
       "      <td>0.04175</td>\n",
       "      <td>0.02000</td>\n",
       "      <td>0.03475</td>\n",
       "      <td>0.08350</td>\n",
       "      <td>0.02525</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.03800</td>\n",
       "      <td>0.04175</td>\n",
       "      <td>0.03700</td>\n",
       "      <td>0.04875</td>\n",
       "      <td>0.11075</td>\n",
       "      <td>0.07325</td>\n",
       "      <td>0.04950</td>\n",
       "      <td>0.10775</td>\n",
       "      <td>0.10200</td>\n",
       "      <td>0.02050</td>\n",
       "      <td>...</td>\n",
       "      <td>0.03900</td>\n",
       "      <td>0.01925</td>\n",
       "      <td>0.01725</td>\n",
       "      <td>0.02425</td>\n",
       "      <td>0.13425</td>\n",
       "      <td>0.02225</td>\n",
       "      <td>0.01600</td>\n",
       "      <td>0.01450</td>\n",
       "      <td>0.09600</td>\n",
       "      <td>0.02025</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.04200</td>\n",
       "      <td>0.05250</td>\n",
       "      <td>0.02725</td>\n",
       "      <td>0.07975</td>\n",
       "      <td>0.05625</td>\n",
       "      <td>0.07025</td>\n",
       "      <td>0.05975</td>\n",
       "      <td>0.18275</td>\n",
       "      <td>0.05175</td>\n",
       "      <td>0.02725</td>\n",
       "      <td>...</td>\n",
       "      <td>0.03950</td>\n",
       "      <td>0.02625</td>\n",
       "      <td>0.02725</td>\n",
       "      <td>0.03450</td>\n",
       "      <td>0.16925</td>\n",
       "      <td>0.03525</td>\n",
       "      <td>0.01725</td>\n",
       "      <td>0.01875</td>\n",
       "      <td>0.09925</td>\n",
       "      <td>0.02000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.03600</td>\n",
       "      <td>0.03850</td>\n",
       "      <td>0.03500</td>\n",
       "      <td>0.03125</td>\n",
       "      <td>0.07100</td>\n",
       "      <td>0.04500</td>\n",
       "      <td>0.02475</td>\n",
       "      <td>0.08300</td>\n",
       "      <td>0.05150</td>\n",
       "      <td>0.02975</td>\n",
       "      <td>...</td>\n",
       "      <td>0.05375</td>\n",
       "      <td>0.03300</td>\n",
       "      <td>0.02275</td>\n",
       "      <td>0.04025</td>\n",
       "      <td>0.19600</td>\n",
       "      <td>0.05700</td>\n",
       "      <td>0.01550</td>\n",
       "      <td>0.01475</td>\n",
       "      <td>0.06625</td>\n",
       "      <td>0.01400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.04075</td>\n",
       "      <td>0.05125</td>\n",
       "      <td>0.05800</td>\n",
       "      <td>0.03675</td>\n",
       "      <td>0.07575</td>\n",
       "      <td>0.12675</td>\n",
       "      <td>0.02975</td>\n",
       "      <td>0.08175</td>\n",
       "      <td>0.03075</td>\n",
       "      <td>0.02950</td>\n",
       "      <td>...</td>\n",
       "      <td>0.04000</td>\n",
       "      <td>0.02850</td>\n",
       "      <td>0.02100</td>\n",
       "      <td>0.02650</td>\n",
       "      <td>0.15475</td>\n",
       "      <td>0.02050</td>\n",
       "      <td>0.01700</td>\n",
       "      <td>0.01575</td>\n",
       "      <td>0.11275</td>\n",
       "      <td>0.01975</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 1128 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "tagId       1        2        3        4        5        6        7     \\\n",
       "movieId                                                                  \n",
       "1        0.02400  0.02400  0.05475  0.09200  0.14825  0.21500  0.06625   \n",
       "2        0.03800  0.04175  0.03700  0.04875  0.11075  0.07325  0.04950   \n",
       "3        0.04200  0.05250  0.02725  0.07975  0.05625  0.07025  0.05975   \n",
       "4        0.03600  0.03850  0.03500  0.03125  0.07100  0.04500  0.02475   \n",
       "5        0.04075  0.05125  0.05800  0.03675  0.07575  0.12675  0.02975   \n",
       "\n",
       "tagId       8        9        10     ...        1119     1120     1121  \\\n",
       "movieId                              ...                                 \n",
       "1        0.27025  0.26050  0.03025   ...     0.03650  0.01800  0.04525   \n",
       "2        0.10775  0.10200  0.02050   ...     0.03900  0.01925  0.01725   \n",
       "3        0.18275  0.05175  0.02725   ...     0.03950  0.02625  0.02725   \n",
       "4        0.08300  0.05150  0.02975   ...     0.05375  0.03300  0.02275   \n",
       "5        0.08175  0.03075  0.02950   ...     0.04000  0.02850  0.02100   \n",
       "\n",
       "tagId       1122     1123     1124     1125     1126     1127     1128  \n",
       "movieId                                                                 \n",
       "1        0.03275  0.12450  0.04175  0.02000  0.03475  0.08350  0.02525  \n",
       "2        0.02425  0.13425  0.02225  0.01600  0.01450  0.09600  0.02025  \n",
       "3        0.03450  0.16925  0.03525  0.01725  0.01875  0.09925  0.02000  \n",
       "4        0.04025  0.19600  0.05700  0.01550  0.01475  0.06625  0.01400  \n",
       "5        0.02650  0.15475  0.02050  0.01700  0.01575  0.11275  0.01975  \n",
       "\n",
       "[5 rows x 1128 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "table.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1065d7550>]], dtype=object)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEICAYAAABRSj9aAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAEwNJREFUeJzt3X+MndV95/H3JzgkKUNtEtIRwt46VWi7ESgpjAJRV92Z\neLc1pIqRmiIiUgzyrqVuEqWb7ArvbqXuj0rrqKJRYKO0bokwFe2E0qa2CGmFHEYoqzVbu0kxgXYz\noSa1y9qb2Hh3AkmX7nf/uA/Ziddw78zcmes5835Jo/s85znPPec7Hn/uM+f+mFQVkqR2vWbUE5Ak\nLS+DXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJfOIcmHkhxK8t0k9456PtJSrBv1BKTz1N8A\nvwr8DPCGEc9FWhKDXjqHqvpDgCQTwMYRT0daEpduJKlxBr0kNc6gl6TGGfSS1DifjJXOIck6ev8/\nLgAuSPJ64KWqemm0M5MWzit66dx+GXgR2AV8oNv+5ZHOSFqk+IdHJKltXtFLUuMMeklqnEEvSY0z\n6CWpcefFyysvvfTS2rx586LO/fa3v81FF1003Amd56x5bbDmtWEpNR8+fPibVfXmfv3Oi6DfvHkz\nhw4dWtS5MzMzTE5ODndC5zlrXhuseW1YSs1Jnh2kn0s3ktQ4g16SGmfQS1LjDHpJapxBL0mNM+gl\nqXEGvSQ1zqCXpMYZ9JLUuPPinbFLceT4GW7b9fmRjH1093tGMq4kLYRX9JLUOINekhpn0EtS4wx6\nSWqcQS9JjTPoJalxBr0kNW6goE+yIcmDSf4iydNJ3pXkjUkeSfK17vaSrm+S3JVkNskTSa5e3hIk\nSa9m0Cv6TwJ/XFU/DrwdeBrYBRyoqiuAA90+wPXAFd3XTuDTQ52xJGlB+gZ9kvXATwH3AFTV31bV\n88A2YG/XbS9wY7e9Dbiveg4CG5JcNvSZS5IGkqp69Q7JO4A9wFP0ruYPAx8BjlfVhq5PgNNVtSHJ\nQ8DuqvpSd+wAcEdVHTrrfnfSu+JnfHz8munp6UUVcPLUGU68uKhTl+yqy9ePZNy5uTnGxsZGMvao\nWPPaYM0LMzU1dbiqJvr1G+SzbtYBVwMfrqrHk3yS/7dMA0BVVZJXf8Q4S1XtofcAwsTERC32r6Df\nff8+7jwymo/sOXrL5EjGXcpfjV+trHltsOblMcga/THgWFU93u0/SC/4T7y8JNPdnuyOHwc2zTt/\nY9cmSRqBvkFfVf8d+OskP9Y1baG3jLMf2N61bQf2ddv7gVu7V99cB5ypqueGO21J0qAGXfP4MHB/\nkguBZ4Db6T1IPJBkB/AscFPX92HgBmAWeKHrK0kakYGCvqq+ApxrwX/LOfoW8MElzkuSNCS+M1aS\nGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalx\nBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxg0U9EmOJjmS5CtJ\nDnVtb0zySJKvdbeXdO1JcleS2SRPJLl6OQuQJL26hVzRT1XVO6pqotvfBRyoqiuAA90+wPXAFd3X\nTuDTw5qsJGnhlrJ0sw3Y223vBW6c135f9RwENiS5bAnjSJKWIFXVv1PyV8BpoIDfrKo9SZ6vqg3d\n8QCnq2pDkoeA3VX1pe7YAeCOqjp01n3upHfFz/j4+DXT09OLKuDkqTOceHFRpy7ZVZevH8m4c3Nz\njI2NjWTsUbHmtcGaF2ZqaurwvFWWV7RuwPv7B1V1PMkPAY8k+Yv5B6uqkvR/xPj+c/YAewAmJiZq\ncnJyIad/z9337+POI4OWMVxHb5kcybgzMzMs9vu1Wlnz2mDNy2OgpZuqOt7dngQ+B7wTOPHykkx3\ne7LrfhzYNO/0jV2bJGkE+gZ9kouSXP
zyNvDTwJPAfmB71207sK/b3g/c2r365jrgTFU9N/SZS5IG\nMsiaxzjwud4yPOuA362qP07yp8ADSXYAzwI3df0fBm4AZoEXgNuHPmtJ0sD6Bn1VPQO8/Rzt3wK2\nnKO9gA8OZXaSpCXznbGS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16S\nGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalx\nBr0kNc6gl6TGDRz0SS5I8uUkD3X7b0nyeJLZJJ9NcmHX/rpuf7Y7vnl5pi5JGsRCrug/Ajw9b//j\nwCeq6q3AaWBH174DON21f6LrJ0kakYGCPslG4D3Ab3f7Ad4NPNh12Qvc2G1v6/bpjm/p+kuSRiBV\n1b9T8iDwH4GLgX8B3AYc7K7aSbIJ+EJVXZnkSWBrVR3rjn0duLaqvnnWfe4EdgKMj49fMz09vagC\nTp46w4kXF3Xqkl11+fqRjDs3N8fY2NhIxh4Va14brHlhpqamDlfVRL9+6/p1SPKzwMmqOpxkclGz\nOYeq2gPsAZiYmKjJycXd9d337+POI33LWBZHb5kcybgzMzMs9vu1Wlnz2mDNy2OQhPxJ4L1JbgBe\nD/wg8ElgQ5J1VfUSsBE43vU/DmwCjiVZB6wHvjX0mUuSBtJ3jb6q/lVVbayqzcDNwBer6hbgUeB9\nXbftwL5ue3+3T3f8izXI+pAkaVks5XX0dwAfTTILvAm4p2u/B3hT1/5RYNfSpihJWooFLW5X1Qww\n020/A7zzHH2+A/z8EOYmSRoC3xkrSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1LjDHpJapxB\nL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS\n1DiDXpIaZ9BLUuMMeklqXN+gT/L6JP81yZ8n+WqSf9e1vyXJ40lmk3w2yYVd++u6/dnu+OblLUGS\n9GoGuaL/LvDuqno78A5ga5LrgI8Dn6iqtwKngR1d/x3A6a79E10/SdKI9A366pnrdl/bfRXwbuDB\nrn0vcGO3va3bpzu+JUmGNmNJ0oKkqvp3Si4ADgNvBT4F/BpwsLtqJ8km4AtVdWWSJ4GtVXWsO/Z1\n4Nqq+uZZ97kT2AkwPj5+zfT09KIKOHnqDCdeXNSpS3bV5etHMu7c3BxjY2MjGXtUrHltsOaFmZqa\nOlxVE/36rRvkzqrq74B3JNkAfA748UXN6vvvcw+wB2BiYqImJycXdT9337+PO48MVMbQHb1lciTj\nzszMsNjv12plzWuDNS+PBb3qpqqeBx4F3gVsSPJywm4Ejnfbx4FNAN3x9cC3hjJbSdKCDfKqmzd3\nV/IkeQPwj4Gn6QX++7pu24F93fb+bp/u+BdrkPUhSdKyGGTN4zJgb7dO/xrggap6KMlTwHSSXwW+\nDNzT9b8H+J0ks8Ap4OZlmLckaUB9g76qngB+4hztzwDvPEf7d4CfH8rsJElL5jtjJalxBr0kNc6g\nl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1LjDHpJ\napxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWpc36BPsinJo0meSvLVJB/p2t+Y\n5JEkX+tuL+nak+SuJLNJnkhy9XIXIUl6ZesG6PMS8LGq+rMkFwOHkzwC3AYcqKrdSXYBu4A7gOuB\nK7qva4FPd7fN2bzr8yMZ996tF41kXEmrU98r+qp6rqr+rNv+X8DTwOXANmBv120vcGO3vQ24r3oO\nAhuSXDb0mUuSBpKqGrxzshl4DLgS+EZVbejaA5yuqg1JHgJ2V9WXumMHgDuq6tBZ97UT2AkwPj5+\nzf
T09KIKOHnqDCdeXNSpq9Zb1l/A2NjYqKexoubm5qx5DbDmhZmamjpcVRP9+g2ydANAkjHgD4Bf\nqqr/2cv2nqqqJIM/YvTO2QPsAZiYmKjJycmFnP49d9+/jzuPDFxGE+7dehGL/X6tVjMzM9a8Bljz\n8hjoVTdJXksv5O+vqj/smk+8vCTT3Z7s2o8Dm+advrFrkySNwCCvuglwD/B0Vf36vEP7ge3d9nZg\n37z2W7tX31wHnKmq54Y4Z0nSAgyy5vGTwC8AR5J8pWv718Bu4IEkO4BngZu6Yw8DNwCzwAvA7UOd\nsSRpQfoGffekal7h8JZz9C/gg0uclyRpSHxnrCQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9J\njTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4\ng16SGmfQS1LjDHpJapxBL0mNM+glqXF9gz7JZ5KcTPLkvLY3Jnkkyde620u69iS5K8lskieSXL2c\nk5ck9bdugD73Av8JuG9e2y7gQFXtTrKr278DuB64ovu6Fvh0d6shOnL8DLft+vxIxj66+z0jGVfS\n4vW9oq+qx4BTZzVvA/Z223uBG+e131c9B4ENSS4b1mQlSQuXqurfKdkMPFRVV3b7z1fVhm47wOmq\n2pDkIWB3VX2pO3YAuKOqDp3jPncCOwHGx8evmZ6eXlQBJ0+d4cSLizp11Rp/AyOr+arL149k3Lm5\nOcbGxkYy9qhY89qwlJqnpqYOV9VEv36DLN28qqqqJP0fLf7/8/YAewAmJiZqcnJyUePfff8+7jyy\n5DJWlY9d9dLIaj56y+RIxp2ZmWGxPyOrlTWvDStR82JfdXPi5SWZ7vZk134c2DSv38auTZI0IosN\n+v3A9m57O7BvXvut3atvrgPOVNVzS5yjJGkJ+v7+n+T3gEng0iTHgF8BdgMPJNkBPAvc1HV/GLgB\nmAVeAG5fhjlLkhagb9BX1ftf4dCWc/Qt4INLnZQkaXh8Z6wkNc6gl6TGGfSS1DiDXpIaZ9BLUuMM\neklqnEEvSY0z6CWpcQa9JDVubX3so5Zs84j+4Mm9Wy8aybhSC7yil6TGGfSS1DiDXpIaZ9BLUuMM\neklqnK+60apw5PgZbhvBK36O7n7Pio8pDZtX9JLUOINekhpn0EtS41yjl17FqN4JDL4bWMPjFb0k\nNc4reuk85SuNNCwGvaTzxqge3EZpJZboDHpJ32eUz0t87KqRDd20ZVmjT7I1yV8mmU2yaznGkCQN\nZuhBn+QC4FPA9cDbgPcneduwx5EkDWY5rujfCcxW1TNV9bfANLBtGcaRJA0gVTXcO0zeB2ytqn/S\n7f8CcG1VfeisfjuBnd3ujwF/ucghLwW+uchzVytrXhuseW1YSs0/XFVv7tdpZE/GVtUeYM9S7yfJ\noaqaGMKUVg1rXhuseW1YiZqXY+nmOLBp3v7Grk2SNALLEfR/ClyR5C1JLgRuBvYvwziSpAEMfemm\nql5K8iHgT4ALgM9U1VeHPc48S17+WYWseW2w5rVh2Wse+pOxkqTzix9qJkmNM+glqXGrJuj7faxC\nktcl+Wx3/PEkm1d+lsM1QM0fTfJUkieSHEjyw6OY5zAN+vEZSX4uSSVZ9S/FG6TmJDd1/9ZfTfK7\nKz3HYRvgZ/vvJXk0yZe7n+8bRjHPYUnymSQnkzz5CseT5K7u+/FEkquHOoGqOu+/6D2p+3XgR4AL\ngT8H3nZWn38G/Ea3fTPw2VHPewVqngJ+oNv+xbVQc9fvYuAx4CAwMep5r8C/8xXAl4FLuv0fGvW8\nV6DmPcAvdttvA46Oet5LrPmngKuBJ1/h+A3AF4AA1wGPD3P81XJFP8jHKmwD9nbbDwJbkmQF5zhs\nfWuuqker6oVu9yC99yysZoN+fMZ/AD4OfGclJ7dMBqn5nwKfqqrT
AFV1coXnOGyD1FzAD3bb64G/\nWcH5DV1VPQacepUu24D7qucgsCHJZcMaf7UE/eXAX8/bP9a1nbNPVb0EnAHetCKzWx6D1DzfDnpX\nBKtZ35q7X2k3VVUrH1o+yL/zjwI/muQ/JzmYZOuKzW55DFLzvwU+kOQY8DDw4ZWZ2sgs9P/7gvh5\n9A1I8gFgAviHo57LckryGuDXgdtGPJWVto7e8s0kvd/aHktyVVU9P9JZLa/3A/dW1Z1J3gX8TpIr\nq+r/jHpiq9FquaIf5GMVvtcnyTp6v+59a0VmtzwG+iiJJP8I+DfAe6vquys0t+XSr+aLgSuBmSRH\n6a1l7l/lT8gO8u98DNhfVf+7qv4K+G/0gn+1GqTmHcADAFX1X4DX0/vwr1Yt60fHrJagH+RjFfYD\n27vt9wFfrO5ZjlWqb81JfgL4TXohv9rXbaFPzVV1pqourarNVbWZ3vMS762qQ6OZ7lAM8rP9R/Su\n5klyKb2lnGdWcpJDNkjN3wC2ACT5+/SC/n+s6CxX1n7g1u7VN9cBZ6rquWHd+apYuqlX+FiFJP8e\nOFRV+4F76P16N0vvSY+bRzfjpRuw5l8DxoDf7553/kZVvXdkk16iAWtuyoA1/wnw00meAv4O+JdV\ntWp/Wx2w5o8Bv5Xkn9N7Yva21XzhluT36D1YX9o97/ArwGsBquo36D0PcQMwC7wA3D7U8Vfx906S\nNIDVsnQjSVokg16SGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ17v8Cmf/ETUsBdJcAAAAASUVORK5C\nYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x1065b7710>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "table[:1].T.hist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "conclusion: 0.3 seems to be a good cut-off"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "movie_tags = genome_scores[genome_scores.relevance > 0.3][['movieId', 'tagId']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Merge in tag and movie names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "tags_to_movies = pd.merge(movie_tags, genome_tags, on='tagId', how='left')[['movieId', 'tagId']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "tags_to_movies['tagId'] = tags_to_movies.tagId.astype(str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def _concatenate_tags_of_movie(tags):\n",
    "    tags_as_str = ' '.join(set(tags))\n",
    "    return tags_as_str"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "tags_per_movie = tags_to_movies.groupby('movieId')['tagId'].agg({\n",
    "    'movie_tags': _concatenate_tags_of_movie\n",
    "}).reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "avg_ratings = ratings.groupby('movieId')['rating'].agg({\n",
    "    'rating_mean': 'mean',\n",
    "    'rating_median': 'median',\n",
    "    'num_ratings': 'size'\n",
    "}).reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "movies_with_ratings = pd.merge(movie_names, avg_ratings, how='left', on='movieId')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "dataset = pd.merge(movies_with_ratings, tags_per_movie, how='left', on='movieId')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "dataset.rename(columns={'median': 'rating_median', 'mean': 'rating_mean', 'tagId': 'movie_tags'}, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Extracting movie year from title"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def extract_year_from_movie_title(movie_title):\n",
    "    matches = re.findall(r'\\d{4}', movie_title)\n",
    "    if len(matches) > 1:\n",
    "        return int(matches[-1])\n",
    "    if len(matches) < 1:\n",
    "        return np.nan\n",
    "    return int(matches[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Derive the release year from each title; the column comes out as float\n",
    "# (e.g. 1995.0) whenever at least one title yields NaN.\n",
    "dataset['year'] = dataset['title'].map(extract_year_from_movie_title)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "      <th>num_ratings</th>\n",
       "      <th>rating_median</th>\n",
       "      <th>rating_mean</th>\n",
       "      <th>movie_tags</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>Adventure|Animation|Children|Comedy|Fantasy</td>\n",
       "      <td>63469.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>3.889300</td>\n",
       "      <td>113 93 1071 745 881 186 1025 464 588 355 942 1...</td>\n",
       "      <td>1995.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Jumanji (1995)</td>\n",
       "      <td>Adventure|Children|Fantasy</td>\n",
       "      <td>25045.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.229527</td>\n",
       "      <td>113 745 881 186 694 20 464 588 355 314 22 664 ...</td>\n",
       "      <td>1995.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Grumpier Old Men (1995)</td>\n",
       "      <td>Comedy|Romance</td>\n",
       "      <td>15381.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.178142</td>\n",
       "      <td>1071 374 846 902 919 629 469 609 464 807 1057 ...</td>\n",
       "      <td>1995.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>Waiting to Exhale (1995)</td>\n",
       "      <td>Comedy|Drama|Romance</td>\n",
       "      <td>2961.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2.879433</td>\n",
       "      <td>864 374 846 425 602 900 388 807 464 107 726 97...</td>\n",
       "      <td>1995.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>Father of the Bride Part II (1995)</td>\n",
       "      <td>Comedy</td>\n",
       "      <td>15023.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.080410</td>\n",
       "      <td>1040 157 926 1071 204 864 374 334 902 694 919 ...</td>\n",
       "      <td>1995.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   movieId                               title  \\\n",
       "0        1                    Toy Story (1995)   \n",
       "1        2                      Jumanji (1995)   \n",
       "2        3             Grumpier Old Men (1995)   \n",
       "3        4            Waiting to Exhale (1995)   \n",
       "4        5  Father of the Bride Part II (1995)   \n",
       "\n",
       "                                        genres  num_ratings  rating_median  \\\n",
       "0  Adventure|Animation|Children|Comedy|Fantasy      63469.0            4.0   \n",
       "1                   Adventure|Children|Fantasy      25045.0            3.0   \n",
       "2                               Comedy|Romance      15381.0            3.0   \n",
       "3                         Comedy|Drama|Romance       2961.0            3.0   \n",
       "4                                       Comedy      15023.0            3.0   \n",
       "\n",
       "   rating_mean                                         movie_tags    year  \n",
       "0     3.889300  113 93 1071 745 881 186 1025 464 588 355 942 1...  1995.0  \n",
       "1     3.229527  113 745 881 186 694 20 464 588 355 314 22 664 ...  1995.0  \n",
       "2     3.178142  1071 374 846 902 919 629 469 609 464 807 1057 ...  1995.0  \n",
       "3     2.879433  864 374 846 425 602 900 388 807 464 107 726 97...  1995.0  \n",
       "4     3.080410  1040 157 926 1071 204 864 374 334 902 694 919 ...  1995.0  "
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the merged dataset: rating statistics, tag IDs and extracted year per movie.\n",
    "dataset.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "There are movies without tags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "      <th>num_ratings</th>\n",
       "      <th>rating_median</th>\n",
       "      <th>rating_mean</th>\n",
       "      <th>movie_tags</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>51</td>\n",
       "      <td>Guardian Angel (1994)</td>\n",
       "      <td>Action|Drama|Thriller</td>\n",
       "      <td>34.0</td>\n",
       "      <td>3.00</td>\n",
       "      <td>2.588235</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1994.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>107</th>\n",
       "      <td>109</td>\n",
       "      <td>Headless Body in Topless Bar (1995)</td>\n",
       "      <td>Comedy|Drama|Thriller</td>\n",
       "      <td>18.0</td>\n",
       "      <td>2.25</td>\n",
       "      <td>2.333333</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1995.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113</th>\n",
       "      <td>115</td>\n",
       "      <td>Happiness Is in the Field (Bonheur est dans le...</td>\n",
       "      <td>Comedy</td>\n",
       "      <td>50.0</td>\n",
       "      <td>4.00</td>\n",
       "      <td>3.380000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1995.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>122</th>\n",
       "      <td>124</td>\n",
       "      <td>Star Maker, The (Uomo delle stelle, L') (1995)</td>\n",
       "      <td>Drama</td>\n",
       "      <td>335.0</td>\n",
       "      <td>4.00</td>\n",
       "      <td>3.489552</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1995.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>125</th>\n",
       "      <td>127</td>\n",
       "      <td>Silences of the Palace, The (Saimt el Qusur) (...</td>\n",
       "      <td>Drama</td>\n",
       "      <td>51.0</td>\n",
       "      <td>3.00</td>\n",
       "      <td>3.215686</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1994.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     movieId                                              title  \\\n",
       "50        51                              Guardian Angel (1994)   \n",
       "107      109                Headless Body in Topless Bar (1995)   \n",
       "113      115  Happiness Is in the Field (Bonheur est dans le...   \n",
       "122      124     Star Maker, The (Uomo delle stelle, L') (1995)   \n",
       "125      127  Silences of the Palace, The (Saimt el Qusur) (...   \n",
       "\n",
       "                    genres  num_ratings  rating_median  rating_mean  \\\n",
       "50   Action|Drama|Thriller         34.0           3.00     2.588235   \n",
       "107  Comedy|Drama|Thriller         18.0           2.25     2.333333   \n",
       "113                 Comedy         50.0           4.00     3.380000   \n",
       "122                  Drama        335.0           4.00     3.489552   \n",
       "125                  Drama         51.0           3.00     3.215686   \n",
       "\n",
       "    movie_tags    year  \n",
       "50         NaN  1994.0  \n",
       "107        NaN  1995.0  \n",
       "113        NaN  1995.0  \n",
       "122        NaN  1995.0  \n",
       "125        NaN  1994.0  "
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset.loc[dataset['movie_tags'].isnull()].head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "There are movies without ratings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "      <th>num_ratings</th>\n",
       "      <th>rating_median</th>\n",
       "      <th>rating_mean</th>\n",
       "      <th>movie_tags</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>8531</th>\n",
       "      <td>25981</td>\n",
       "      <td>Man on a Tightrope (1953)</td>\n",
       "      <td>Drama</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1953.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9322</th>\n",
       "      <td>27396</td>\n",
       "      <td>Gentleman's Game, A (2002)</td>\n",
       "      <td>Drama</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2002.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9779</th>\n",
       "      <td>31797</td>\n",
       "      <td>White Banners (1938)</td>\n",
       "      <td>Drama</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1938.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13307</th>\n",
       "      <td>65078</td>\n",
       "      <td>Jane Austen in Manhattan (1980)</td>\n",
       "      <td>Drama|Romance</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1980.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13510</th>\n",
       "      <td>66622</td>\n",
       "      <td>His Private Secretary (1933)</td>\n",
       "      <td>Comedy|Romance</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1933.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       movieId                            title          genres  num_ratings  \\\n",
       "8531     25981        Man on a Tightrope (1953)           Drama          NaN   \n",
       "9322     27396       Gentleman's Game, A (2002)           Drama          NaN   \n",
       "9779     31797             White Banners (1938)           Drama          NaN   \n",
       "13307    65078  Jane Austen in Manhattan (1980)   Drama|Romance          NaN   \n",
       "13510    66622     His Private Secretary (1933)  Comedy|Romance          NaN   \n",
       "\n",
       "       rating_median  rating_mean movie_tags    year  \n",
       "8531             NaN          NaN        NaN  1953.0  \n",
       "9322             NaN          NaN        NaN  2002.0  \n",
       "9779             NaN          NaN        NaN  1938.0  \n",
       "13307            NaN          NaN        NaN  1980.0  \n",
       "13510            NaN          NaN        NaN  1933.0  "
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset.loc[dataset['rating_mean'].isnull()].head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Conclusion: Movies without tags cannot be related to other movies because they lack features (tags); they could instead be presented as a \"random recommendation\" solution."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Bag of words vectorization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics.pairwise import cosine_similarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Keep only movies that have tags; the fresh 0..n-1 index makes row labels match\n",
    "# the row positions of the matrices built from this frame.\n",
    "dataset_with_tags = dataset[dataset['movie_tags'].notnull()].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# movie_tags is a space-separated list of numeric tag IDs. The default\n",
    "# token_pattern only keeps tokens of two or more characters, which would\n",
    "# silently drop the single-digit tag IDs 1-9, so accept one-character tokens.\n",
    "bag_of_words = CountVectorizer(token_pattern=r'(?u)\\b\\w+\\b')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Every genome tag ID joined into one space-separated string, wrapped in a list.\n",
    "# NOTE(review): not used by the vectorizer cells visible here - confirm it is needed.\n",
    "tags_as_descriptors = [' '.join(map(str, genome_tags['tagId']))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Learn the tag vocabulary and build the movie x tag count matrix.\n",
    "movies_described_bag_of_words = bag_of_words.fit_transform(dataset_with_tags['movie_tags'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Pairwise cosine similarity between all tagged movies, from raw tag counts.\n",
    "df_bag_m2m = pd.DataFrame(data=cosine_similarity(X=movies_described_bag_of_words))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Tf-Idf Vectorization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Tokens are numeric tag IDs; the default token_pattern requires at least two\n",
    "# characters and would silently drop tag IDs 1-9, so accept one-character tokens.\n",
    "tf_idf = TfidfVectorizer(token_pattern=r'(?u)\\b\\w+\\b')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Fit TF-IDF on the movie_tags documents of the tagged movies.\n",
    "movies_tf_idf_described = tf_idf.fit_transform(dataset_with_tags['movie_tags'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Pairwise cosine similarity between the TF-IDF rows of all tagged movies.\n",
    "m2m = cosine_similarity(movies_tf_idf_described)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Reuse the similarity matrix already held in m2m instead of computing the\n",
    "# full movies x movies cosine similarity a second time.\n",
    "df_tfidf_m2m = pd.DataFrame(m2m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>10664</th>\n",
       "      <th>10665</th>\n",
       "      <th>10666</th>\n",
       "      <th>10667</th>\n",
       "      <th>10668</th>\n",
       "      <th>10669</th>\n",
       "      <th>10670</th>\n",
       "      <th>10671</th>\n",
       "      <th>10672</th>\n",
       "      <th>10673</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.431993</td>\n",
       "      <td>0.159781</td>\n",
       "      <td>0.141310</td>\n",
       "      <td>0.216984</td>\n",
       "      <td>0.256402</td>\n",
       "      <td>0.219518</td>\n",
       "      <td>0.255039</td>\n",
       "      <td>0.084687</td>\n",
       "      <td>0.251235</td>\n",
       "      <td>...</td>\n",
       "      <td>0.253995</td>\n",
       "      <td>0.245707</td>\n",
       "      <td>0.298894</td>\n",
       "      <td>0.218980</td>\n",
       "      <td>0.243204</td>\n",
       "      <td>0.228843</td>\n",
       "      <td>0.338929</td>\n",
       "      <td>0.390264</td>\n",
       "      <td>0.527402</td>\n",
       "      <td>0.210074</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.431993</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.107847</td>\n",
       "      <td>0.120763</td>\n",
       "      <td>0.237059</td>\n",
       "      <td>0.120485</td>\n",
       "      <td>0.181958</td>\n",
       "      <td>0.282949</td>\n",
       "      <td>0.188213</td>\n",
       "      <td>0.234384</td>\n",
       "      <td>...</td>\n",
       "      <td>0.365564</td>\n",
       "      <td>0.130227</td>\n",
       "      <td>0.221158</td>\n",
       "      <td>0.171833</td>\n",
       "      <td>0.172612</td>\n",
       "      <td>0.160411</td>\n",
       "      <td>0.251052</td>\n",
       "      <td>0.265634</td>\n",
       "      <td>0.299784</td>\n",
       "      <td>0.181168</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.159781</td>\n",
       "      <td>0.107847</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.260063</td>\n",
       "      <td>0.430876</td>\n",
       "      <td>0.118901</td>\n",
       "      <td>0.369714</td>\n",
       "      <td>0.206708</td>\n",
       "      <td>0.067116</td>\n",
       "      <td>0.175896</td>\n",
       "      <td>...</td>\n",
       "      <td>0.151341</td>\n",
       "      <td>0.135469</td>\n",
       "      <td>0.134628</td>\n",
       "      <td>0.093538</td>\n",
       "      <td>0.102329</td>\n",
       "      <td>0.123552</td>\n",
       "      <td>0.142192</td>\n",
       "      <td>0.124184</td>\n",
       "      <td>0.133702</td>\n",
       "      <td>0.096047</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.141310</td>\n",
       "      <td>0.120763</td>\n",
       "      <td>0.260063</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.263465</td>\n",
       "      <td>0.088910</td>\n",
       "      <td>0.374638</td>\n",
       "      <td>0.244020</td>\n",
       "      <td>0.077906</td>\n",
       "      <td>0.075830</td>\n",
       "      <td>...</td>\n",
       "      <td>0.144143</td>\n",
       "      <td>0.136691</td>\n",
       "      <td>0.119444</td>\n",
       "      <td>0.108307</td>\n",
       "      <td>0.110534</td>\n",
       "      <td>0.113956</td>\n",
       "      <td>0.159768</td>\n",
       "      <td>0.165774</td>\n",
       "      <td>0.165368</td>\n",
       "      <td>0.079979</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.216984</td>\n",
       "      <td>0.237059</td>\n",
       "      <td>0.430876</td>\n",
       "      <td>0.263465</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.050463</td>\n",
       "      <td>0.446251</td>\n",
       "      <td>0.182284</td>\n",
       "      <td>0.108008</td>\n",
       "      <td>0.137105</td>\n",
       "      <td>...</td>\n",
       "      <td>0.180975</td>\n",
       "      <td>0.204900</td>\n",
       "      <td>0.114578</td>\n",
       "      <td>0.107630</td>\n",
       "      <td>0.137742</td>\n",
       "      <td>0.105655</td>\n",
       "      <td>0.173343</td>\n",
       "      <td>0.187519</td>\n",
       "      <td>0.182170</td>\n",
       "      <td>0.098338</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 10674 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      0         1         2         3         4         5         6      \\\n",
       "0  1.000000  0.431993  0.159781  0.141310  0.216984  0.256402  0.219518   \n",
       "1  0.431993  1.000000  0.107847  0.120763  0.237059  0.120485  0.181958   \n",
       "2  0.159781  0.107847  1.000000  0.260063  0.430876  0.118901  0.369714   \n",
       "3  0.141310  0.120763  0.260063  1.000000  0.263465  0.088910  0.374638   \n",
       "4  0.216984  0.237059  0.430876  0.263465  1.000000  0.050463  0.446251   \n",
       "\n",
       "      7         8         9        ...        10664     10665     10666  \\\n",
       "0  0.255039  0.084687  0.251235    ...     0.253995  0.245707  0.298894   \n",
       "1  0.282949  0.188213  0.234384    ...     0.365564  0.130227  0.221158   \n",
       "2  0.206708  0.067116  0.175896    ...     0.151341  0.135469  0.134628   \n",
       "3  0.244020  0.077906  0.075830    ...     0.144143  0.136691  0.119444   \n",
       "4  0.182284  0.108008  0.137105    ...     0.180975  0.204900  0.114578   \n",
       "\n",
       "      10667     10668     10669     10670     10671     10672     10673  \n",
       "0  0.218980  0.243204  0.228843  0.338929  0.390264  0.527402  0.210074  \n",
       "1  0.171833  0.172612  0.160411  0.251052  0.265634  0.299784  0.181168  \n",
       "2  0.093538  0.102329  0.123552  0.142192  0.124184  0.133702  0.096047  \n",
       "3  0.108307  0.110534  0.113956  0.159768  0.165774  0.165368  0.079979  \n",
       "4  0.107630  0.137742  0.105655  0.173343  0.187519  0.182170  0.098338  \n",
       "\n",
       "[5 rows x 10674 columns]"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rows and columns are still labelled by position (0..n-1) at this point.\n",
    "df_tfidf_m2m.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Match indices to movie IDs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Row position in the similarity matrix -> movieId (relies on dataset_with_tags\n",
    "# having a fresh 0..n-1 index).\n",
    "index_to_movie_id = dataset_with_tags.loc[:, 'movieId']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot check: position 0 should map to the first movie in dataset_with_tags.\n",
    "index_to_movie_id[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "160980"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot check a position near the end of the mapping.\n",
    "index_to_movie_id[10665]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "movieId                                                         52\n",
       "title                                      Mighty Aphrodite (1995)\n",
       "genres                                        Comedy|Drama|Romance\n",
       "num_ratings                                                  10277\n",
       "rating_median                                                    4\n",
       "rating_mean                                                3.53741\n",
       "movie_tags       829 335 1071 745 845 297 704 609 464 726 1062 ...\n",
       "year                                                          1995\n",
       "Name: 50, dtype: object"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# .ix was deprecated in pandas 0.20 and removed in 1.0. dataset_with_tags already\n",
    "# has a fresh 0..n-1 index, so positional .iloc returns the same row without the\n",
    "# extra reset_index.\n",
    "dataset_with_tags.iloc[50]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Label columns with movie IDs (as strings) by position. Assigning the whole\n",
    "# array at once keeps the cell idempotent: the previous per-column lookup\n",
    "# re-read the already-renamed labels when the cell was run a second time.\n",
    "df_tfidf_m2m.columns = index_to_movie_id.astype(str).values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Label rows with movie IDs by position. Assigning the whole array at once\n",
    "# keeps the cell idempotent: the previous per-row lookup used the already-renamed\n",
    "# labels as positions when the cell was run a second time.\n",
    "df_tfidf_m2m.index = index_to_movie_id.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>10</th>\n",
       "      <th>...</th>\n",
       "      <th>160954</th>\n",
       "      <th>160980</th>\n",
       "      <th>161131</th>\n",
       "      <th>161354</th>\n",
       "      <th>161582</th>\n",
       "      <th>161634</th>\n",
       "      <th>162350</th>\n",
       "      <th>162376</th>\n",
       "      <th>162578</th>\n",
       "      <th>162600</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.431993</td>\n",
       "      <td>0.159781</td>\n",
       "      <td>0.141310</td>\n",
       "      <td>0.216984</td>\n",
       "      <td>0.256402</td>\n",
       "      <td>0.219518</td>\n",
       "      <td>0.255039</td>\n",
       "      <td>0.084687</td>\n",
       "      <td>0.251235</td>\n",
       "      <td>...</td>\n",
       "      <td>0.253995</td>\n",
       "      <td>0.245707</td>\n",
       "      <td>0.298894</td>\n",
       "      <td>0.218980</td>\n",
       "      <td>0.243204</td>\n",
       "      <td>0.228843</td>\n",
       "      <td>0.338929</td>\n",
       "      <td>0.390264</td>\n",
       "      <td>0.527402</td>\n",
       "      <td>0.210074</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.431993</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.107847</td>\n",
       "      <td>0.120763</td>\n",
       "      <td>0.237059</td>\n",
       "      <td>0.120485</td>\n",
       "      <td>0.181958</td>\n",
       "      <td>0.282949</td>\n",
       "      <td>0.188213</td>\n",
       "      <td>0.234384</td>\n",
       "      <td>...</td>\n",
       "      <td>0.365564</td>\n",
       "      <td>0.130227</td>\n",
       "      <td>0.221158</td>\n",
       "      <td>0.171833</td>\n",
       "      <td>0.172612</td>\n",
       "      <td>0.160411</td>\n",
       "      <td>0.251052</td>\n",
       "      <td>0.265634</td>\n",
       "      <td>0.299784</td>\n",
       "      <td>0.181168</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.159781</td>\n",
       "      <td>0.107847</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.260063</td>\n",
       "      <td>0.430876</td>\n",
       "      <td>0.118901</td>\n",
       "      <td>0.369714</td>\n",
       "      <td>0.206708</td>\n",
       "      <td>0.067116</td>\n",
       "      <td>0.175896</td>\n",
       "      <td>...</td>\n",
       "      <td>0.151341</td>\n",
       "      <td>0.135469</td>\n",
       "      <td>0.134628</td>\n",
       "      <td>0.093538</td>\n",
       "      <td>0.102329</td>\n",
       "      <td>0.123552</td>\n",
       "      <td>0.142192</td>\n",
       "      <td>0.124184</td>\n",
       "      <td>0.133702</td>\n",
       "      <td>0.096047</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.141310</td>\n",
       "      <td>0.120763</td>\n",
       "      <td>0.260063</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.263465</td>\n",
       "      <td>0.088910</td>\n",
       "      <td>0.374638</td>\n",
       "      <td>0.244020</td>\n",
       "      <td>0.077906</td>\n",
       "      <td>0.075830</td>\n",
       "      <td>...</td>\n",
       "      <td>0.144143</td>\n",
       "      <td>0.136691</td>\n",
       "      <td>0.119444</td>\n",
       "      <td>0.108307</td>\n",
       "      <td>0.110534</td>\n",
       "      <td>0.113956</td>\n",
       "      <td>0.159768</td>\n",
       "      <td>0.165774</td>\n",
       "      <td>0.165368</td>\n",
       "      <td>0.079979</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.216984</td>\n",
       "      <td>0.237059</td>\n",
       "      <td>0.430876</td>\n",
       "      <td>0.263465</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.050463</td>\n",
       "      <td>0.446251</td>\n",
       "      <td>0.182284</td>\n",
       "      <td>0.108008</td>\n",
       "      <td>0.137105</td>\n",
       "      <td>...</td>\n",
       "      <td>0.180975</td>\n",
       "      <td>0.204900</td>\n",
       "      <td>0.114578</td>\n",
       "      <td>0.107630</td>\n",
       "      <td>0.137742</td>\n",
       "      <td>0.105655</td>\n",
       "      <td>0.173343</td>\n",
       "      <td>0.187519</td>\n",
       "      <td>0.182170</td>\n",
       "      <td>0.098338</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 10674 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          1         2         3         4         5         6         7  \\\n",
       "1  1.000000  0.431993  0.159781  0.141310  0.216984  0.256402  0.219518   \n",
       "2  0.431993  1.000000  0.107847  0.120763  0.237059  0.120485  0.181958   \n",
       "3  0.159781  0.107847  1.000000  0.260063  0.430876  0.118901  0.369714   \n",
       "4  0.141310  0.120763  0.260063  1.000000  0.263465  0.088910  0.374638   \n",
       "5  0.216984  0.237059  0.430876  0.263465  1.000000  0.050463  0.446251   \n",
       "\n",
       "          8         9        10    ...       160954    160980    161131  \\\n",
       "1  0.255039  0.084687  0.251235    ...     0.253995  0.245707  0.298894   \n",
       "2  0.282949  0.188213  0.234384    ...     0.365564  0.130227  0.221158   \n",
       "3  0.206708  0.067116  0.175896    ...     0.151341  0.135469  0.134628   \n",
       "4  0.244020  0.077906  0.075830    ...     0.144143  0.136691  0.119444   \n",
       "5  0.182284  0.108008  0.137105    ...     0.180975  0.204900  0.114578   \n",
       "\n",
       "     161354    161582    161634    162350    162376    162578    162600  \n",
       "1  0.218980  0.243204  0.228843  0.338929  0.390264  0.527402  0.210074  \n",
       "2  0.171833  0.172612  0.160411  0.251052  0.265634  0.299784  0.181168  \n",
       "3  0.093538  0.102329  0.123552  0.142192  0.124184  0.133702  0.096047  \n",
       "4  0.108307  0.110534  0.113956  0.159768  0.165774  0.165368  0.079979  \n",
       "5  0.107630  0.137742  0.105655  0.173343  0.187519  0.182170  0.098338  \n",
       "\n",
       "[5 rows x 10674 columns]"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_tfidf_m2m.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>10</th>\n",
       "      <th>...</th>\n",
       "      <th>160954</th>\n",
       "      <th>160980</th>\n",
       "      <th>161131</th>\n",
       "      <th>161354</th>\n",
       "      <th>161582</th>\n",
       "      <th>161634</th>\n",
       "      <th>162350</th>\n",
       "      <th>162376</th>\n",
       "      <th>162578</th>\n",
       "      <th>162600</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>161634</th>\n",
       "      <td>0.228843</td>\n",
       "      <td>0.160411</td>\n",
       "      <td>0.123552</td>\n",
       "      <td>0.113956</td>\n",
       "      <td>0.105655</td>\n",
       "      <td>0.384652</td>\n",
       "      <td>0.118860</td>\n",
       "      <td>0.114861</td>\n",
       "      <td>0.092883</td>\n",
       "      <td>0.189042</td>\n",
       "      <td>...</td>\n",
       "      <td>0.185566</td>\n",
       "      <td>0.168822</td>\n",
       "      <td>0.256561</td>\n",
       "      <td>0.194614</td>\n",
       "      <td>0.274796</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.258832</td>\n",
       "      <td>0.450956</td>\n",
       "      <td>0.319277</td>\n",
       "      <td>0.097457</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>162350</th>\n",
       "      <td>0.338929</td>\n",
       "      <td>0.251052</td>\n",
       "      <td>0.142192</td>\n",
       "      <td>0.159768</td>\n",
       "      <td>0.173343</td>\n",
       "      <td>0.355809</td>\n",
       "      <td>0.191053</td>\n",
       "      <td>0.226337</td>\n",
       "      <td>0.164536</td>\n",
       "      <td>0.232459</td>\n",
       "      <td>...</td>\n",
       "      <td>0.275451</td>\n",
       "      <td>0.195042</td>\n",
       "      <td>0.404297</td>\n",
       "      <td>0.320394</td>\n",
       "      <td>0.381101</td>\n",
       "      <td>0.258832</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.367129</td>\n",
       "      <td>0.316950</td>\n",
       "      <td>0.253319</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>162376</th>\n",
       "      <td>0.390264</td>\n",
       "      <td>0.265634</td>\n",
       "      <td>0.124184</td>\n",
       "      <td>0.165774</td>\n",
       "      <td>0.187519</td>\n",
       "      <td>0.451733</td>\n",
       "      <td>0.160788</td>\n",
       "      <td>0.161037</td>\n",
       "      <td>0.083897</td>\n",
       "      <td>0.196478</td>\n",
       "      <td>...</td>\n",
       "      <td>0.269517</td>\n",
       "      <td>0.265497</td>\n",
       "      <td>0.398473</td>\n",
       "      <td>0.213949</td>\n",
       "      <td>0.455381</td>\n",
       "      <td>0.450956</td>\n",
       "      <td>0.367129</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.555852</td>\n",
       "      <td>0.204455</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>162578</th>\n",
       "      <td>0.527402</td>\n",
       "      <td>0.299784</td>\n",
       "      <td>0.133702</td>\n",
       "      <td>0.165368</td>\n",
       "      <td>0.182170</td>\n",
       "      <td>0.340640</td>\n",
       "      <td>0.183740</td>\n",
       "      <td>0.229345</td>\n",
       "      <td>0.099949</td>\n",
       "      <td>0.177231</td>\n",
       "      <td>...</td>\n",
       "      <td>0.222804</td>\n",
       "      <td>0.279660</td>\n",
       "      <td>0.314855</td>\n",
       "      <td>0.246288</td>\n",
       "      <td>0.333284</td>\n",
       "      <td>0.319277</td>\n",
       "      <td>0.316950</td>\n",
       "      <td>0.555852</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.209276</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>162600</th>\n",
       "      <td>0.210074</td>\n",
       "      <td>0.181168</td>\n",
       "      <td>0.096047</td>\n",
       "      <td>0.079979</td>\n",
       "      <td>0.098338</td>\n",
       "      <td>0.069668</td>\n",
       "      <td>0.069561</td>\n",
       "      <td>0.117470</td>\n",
       "      <td>0.121824</td>\n",
       "      <td>0.128960</td>\n",
       "      <td>...</td>\n",
       "      <td>0.208857</td>\n",
       "      <td>0.116181</td>\n",
       "      <td>0.166620</td>\n",
       "      <td>0.222607</td>\n",
       "      <td>0.126180</td>\n",
       "      <td>0.097457</td>\n",
       "      <td>0.253319</td>\n",
       "      <td>0.204455</td>\n",
       "      <td>0.209276</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 10674 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "               1         2         3         4         5         6         7  \\\n",
       "161634  0.228843  0.160411  0.123552  0.113956  0.105655  0.384652  0.118860   \n",
       "162350  0.338929  0.251052  0.142192  0.159768  0.173343  0.355809  0.191053   \n",
       "162376  0.390264  0.265634  0.124184  0.165774  0.187519  0.451733  0.160788   \n",
       "162578  0.527402  0.299784  0.133702  0.165368  0.182170  0.340640  0.183740   \n",
       "162600  0.210074  0.181168  0.096047  0.079979  0.098338  0.069668  0.069561   \n",
       "\n",
       "               8         9        10    ...       160954    160980    161131  \\\n",
       "161634  0.114861  0.092883  0.189042    ...     0.185566  0.168822  0.256561   \n",
       "162350  0.226337  0.164536  0.232459    ...     0.275451  0.195042  0.404297   \n",
       "162376  0.161037  0.083897  0.196478    ...     0.269517  0.265497  0.398473   \n",
       "162578  0.229345  0.099949  0.177231    ...     0.222804  0.279660  0.314855   \n",
       "162600  0.117470  0.121824  0.128960    ...     0.208857  0.116181  0.166620   \n",
       "\n",
       "          161354    161582    161634    162350    162376    162578    162600  \n",
       "161634  0.194614  0.274796  1.000000  0.258832  0.450956  0.319277  0.097457  \n",
       "162350  0.320394  0.381101  0.258832  1.000000  0.367129  0.316950  0.253319  \n",
       "162376  0.213949  0.455381  0.450956  0.367129  1.000000  0.555852  0.204455  \n",
       "162578  0.246288  0.333284  0.319277  0.316950  0.555852  1.000000  0.209276  \n",
       "162600  0.222607  0.126180  0.097457  0.253319  0.204455  0.209276  1.000000  \n",
       "\n",
       "[5 rows x 10674 columns]"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_tfidf_m2m.tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Check similarities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1         1.000000\n",
       "3114      0.736535\n",
       "4886      0.724898\n",
       "78499     0.720759\n",
       "2355      0.714265\n",
       "76093     0.686358\n",
       "5218      0.672731\n",
       "68954     0.648925\n",
       "6377      0.647981\n",
       "4306      0.641639\n",
       "50872     0.635019\n",
       "8961      0.626673\n",
       "2761      0.608781\n",
       "81847     0.604853\n",
       "45517     0.602484\n",
       "152081    0.601611\n",
       "42191     0.599953\n",
       "97913     0.594801\n",
       "98491     0.591688\n",
       "108932    0.590483\n",
       "Name: 1, dtype: float64"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_tfidf_m2m.ix[1].sort_values(ascending=False)[:20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "      <th>num_ratings</th>\n",
       "      <th>rating_median</th>\n",
       "      <th>rating_mean</th>\n",
       "      <th>movie_tags</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>Adventure|Animation|Children|Comedy|Fantasy</td>\n",
       "      <td>63469.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>3.8893</td>\n",
       "      <td>113 93 1071 745 881 186 1025 464 588 355 942 1...</td>\n",
       "      <td>1995.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   movieId             title                                       genres  \\\n",
       "0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   \n",
       "\n",
       "   num_ratings  rating_median  rating_mean  \\\n",
       "0      63469.0            4.0       3.8893   \n",
       "\n",
       "                                          movie_tags    year  \n",
       "0  113 93 1071 745 881 186 1025 464 588 355 942 1...  1995.0  "
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset_with_tags[dataset_with_tags.movieId == 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "      <th>num_ratings</th>\n",
       "      <th>rating_median</th>\n",
       "      <th>rating_mean</th>\n",
       "      <th>movie_tags</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2769</th>\n",
       "      <td>3114</td>\n",
       "      <td>Toy Story 2 (1999)</td>\n",
       "      <td>Adventure|Animation|Children|Comedy|Fantasy</td>\n",
       "      <td>26904.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>3.820844</td>\n",
       "      <td>1071 745 186 464 588 355 1062 664 244 455 128 ...</td>\n",
       "      <td>1999.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      movieId               title  \\\n",
       "2769     3114  Toy Story 2 (1999)   \n",
       "\n",
       "                                           genres  num_ratings  rating_median  \\\n",
       "2769  Adventure|Animation|Children|Comedy|Fantasy      26904.0            4.0   \n",
       "\n",
       "      rating_mean                                         movie_tags    year  \n",
       "2769     3.820844  1071 745 186 464 588 355 1062 664 244 455 128 ...  1999.0  "
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset_with_tags[dataset_with_tags.movieId == 3114]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "      <th>num_ratings</th>\n",
       "      <th>rating_median</th>\n",
       "      <th>rating_mean</th>\n",
       "      <th>movie_tags</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4331</th>\n",
       "      <td>4886</td>\n",
       "      <td>Monsters, Inc. (2001)</td>\n",
       "      <td>Adventure|Animation|Children|Comedy|Fantasy</td>\n",
       "      <td>31089.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>3.863633</td>\n",
       "      <td>113 1071 745 186 464 588 355 1062 755 22 372 6...</td>\n",
       "      <td>2001.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      movieId                  title  \\\n",
       "4331     4886  Monsters, Inc. (2001)   \n",
       "\n",
       "                                           genres  num_ratings  rating_median  \\\n",
       "4331  Adventure|Animation|Children|Comedy|Fantasy      31089.0            4.0   \n",
       "\n",
       "      rating_mean                                         movie_tags    year  \n",
       "4331     3.863633  113 1071 745 186 464 588 355 1062 755 22 372 6...  2001.0  "
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset_with_tags[dataset_with_tags.movieId == 4886]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "      <th>num_ratings</th>\n",
       "      <th>rating_median</th>\n",
       "      <th>rating_mean</th>\n",
       "      <th>movie_tags</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>9070</th>\n",
       "      <td>78499</td>\n",
       "      <td>Toy Story 3 (2010)</td>\n",
       "      <td>Adventure|Animation|Children|Comedy|Fantasy|IMAX</td>\n",
       "      <td>10963.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>3.910244</td>\n",
       "      <td>1071 864 745 881 186 845 464 588 355 1062 755 ...</td>\n",
       "      <td>2010.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      movieId               title  \\\n",
       "9070    78499  Toy Story 3 (2010)   \n",
       "\n",
       "                                                genres  num_ratings  \\\n",
       "9070  Adventure|Animation|Children|Comedy|Fantasy|IMAX      10963.0   \n",
       "\n",
       "      rating_median  rating_mean  \\\n",
       "9070            4.0     3.910244   \n",
       "\n",
       "                                             movie_tags    year  \n",
       "9070  1071 864 745 881 186 845 464 588 355 1062 755 ...  2010.0  "
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset_with_tags[dataset_with_tags.movieId == 78499]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "The closest movies to Toy Story 1 are the sequels and Monsters Inc! (No sh*t sherlock)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Stacking for writing to DB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "m2m_similarity_stacked = df_tfidf_m2m.stack().reset_index()\n",
    "m2m_similarity_stacked.columns = ['first_movie', 'second_movie', 'similarity_score']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>first_movie</th>\n",
       "      <th>second_movie</th>\n",
       "      <th>similarity_score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0.431993</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>0.159781</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>0.141310</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>0.216984</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   first_movie second_movie  similarity_score\n",
       "0            1            1          1.000000\n",
       "1            1            2          0.431993\n",
       "2            1            3          0.159781\n",
       "3            1            4          0.141310\n",
       "4            1            5          0.216984"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "m2m_similarity_stacked.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>first_movie</th>\n",
       "      <th>second_movie</th>\n",
       "      <th>similarity_score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>113934271</th>\n",
       "      <td>162600</td>\n",
       "      <td>161634</td>\n",
       "      <td>0.097457</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113934272</th>\n",
       "      <td>162600</td>\n",
       "      <td>162350</td>\n",
       "      <td>0.253319</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113934273</th>\n",
       "      <td>162600</td>\n",
       "      <td>162376</td>\n",
       "      <td>0.204455</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113934274</th>\n",
       "      <td>162600</td>\n",
       "      <td>162578</td>\n",
       "      <td>0.209276</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113934275</th>\n",
       "      <td>162600</td>\n",
       "      <td>162600</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           first_movie second_movie  similarity_score\n",
       "113934271       162600       161634          0.097457\n",
       "113934272       162600       162350          0.253319\n",
       "113934273       162600       162376          0.204455\n",
       "113934274       162600       162578          0.209276\n",
       "113934275       162600       162600          1.000000"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "m2m_similarity_stacked.tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Writing to DB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import sqlite3 as db"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "connection = db.connect('db.sqlite3')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "for_db = dataset.rename(columns={\n",
    "    'movieId': 'movie_id'\n",
    "})[['movie_id', 'title', 'year', 'genres', 'num_ratings', 'rating_median', 'rating_mean']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "for_db['relatable'] = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movie_id</th>\n",
       "      <th>title</th>\n",
       "      <th>year</th>\n",
       "      <th>genres</th>\n",
       "      <th>num_ratings</th>\n",
       "      <th>rating_median</th>\n",
       "      <th>rating_mean</th>\n",
       "      <th>relatable</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>1995.0</td>\n",
       "      <td>Adventure|Animation|Children|Comedy|Fantasy</td>\n",
       "      <td>63469.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>3.889300</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Jumanji (1995)</td>\n",
       "      <td>1995.0</td>\n",
       "      <td>Adventure|Children|Fantasy</td>\n",
       "      <td>25045.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.229527</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Grumpier Old Men (1995)</td>\n",
       "      <td>1995.0</td>\n",
       "      <td>Comedy|Romance</td>\n",
       "      <td>15381.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.178142</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>Waiting to Exhale (1995)</td>\n",
       "      <td>1995.0</td>\n",
       "      <td>Comedy|Drama|Romance</td>\n",
       "      <td>2961.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2.879433</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>Father of the Bride Part II (1995)</td>\n",
       "      <td>1995.0</td>\n",
       "      <td>Comedy</td>\n",
       "      <td>15023.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.080410</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   movie_id                               title    year  \\\n",
       "0         1                    Toy Story (1995)  1995.0   \n",
       "1         2                      Jumanji (1995)  1995.0   \n",
       "2         3             Grumpier Old Men (1995)  1995.0   \n",
       "3         4            Waiting to Exhale (1995)  1995.0   \n",
       "4         5  Father of the Bride Part II (1995)  1995.0   \n",
       "\n",
       "                                        genres  num_ratings  rating_median  \\\n",
       "0  Adventure|Animation|Children|Comedy|Fantasy      63469.0            4.0   \n",
       "1                   Adventure|Children|Fantasy      25045.0            3.0   \n",
       "2                               Comedy|Romance      15381.0            3.0   \n",
       "3                         Comedy|Drama|Romance       2961.0            3.0   \n",
       "4                                       Comedy      15023.0            3.0   \n",
       "\n",
       "   rating_mean relatable  \n",
       "0     3.889300      True  \n",
       "1     3.229527      True  \n",
       "2     3.178142      True  \n",
       "3     2.879433      True  \n",
       "4     3.080410      True  "
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "for_db.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "40501it [00:01, 38515.09it/s]                           \n"
     ]
    }
   ],
   "source": [
    "total_length = len(for_db)\n",
    "step = int(total_length / 100)\n",
    "\n",
    "with tqdm(total=total_length) as pbar:\n",
    "    for i in range(0, total_length, step):\n",
    "        subset = for_db[i: i+step]\n",
    "        subset.to_sql('movie_time_app_movie', connection, if_exists='append', index=False)\n",
    "        pbar.update(step)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movie_id</th>\n",
       "      <th>title</th>\n",
       "      <th>poster</th>\n",
       "      <th>year</th>\n",
       "      <th>genres</th>\n",
       "      <th>num_ratings</th>\n",
       "      <th>rating_median</th>\n",
       "      <th>rating_mean</th>\n",
       "      <th>relatable</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>None</td>\n",
       "      <td>1995</td>\n",
       "      <td>Adventure|Animation|Children|Comedy|Fantasy</td>\n",
       "      <td>63469</td>\n",
       "      <td>4.0</td>\n",
       "      <td>3.889300</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Jumanji (1995)</td>\n",
       "      <td>None</td>\n",
       "      <td>1995</td>\n",
       "      <td>Adventure|Children|Fantasy</td>\n",
       "      <td>25045</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.229527</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Grumpier Old Men (1995)</td>\n",
       "      <td>None</td>\n",
       "      <td>1995</td>\n",
       "      <td>Comedy|Romance</td>\n",
       "      <td>15381</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.178142</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>Waiting to Exhale (1995)</td>\n",
       "      <td>None</td>\n",
       "      <td>1995</td>\n",
       "      <td>Comedy|Drama|Romance</td>\n",
       "      <td>2961</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2.879433</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>Father of the Bride Part II (1995)</td>\n",
       "      <td>None</td>\n",
       "      <td>1995</td>\n",
       "      <td>Comedy</td>\n",
       "      <td>15023</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.080410</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   movie_id                               title poster  year  \\\n",
       "0         1                    Toy Story (1995)   None  1995   \n",
       "1         2                      Jumanji (1995)   None  1995   \n",
       "2         3             Grumpier Old Men (1995)   None  1995   \n",
       "3         4            Waiting to Exhale (1995)   None  1995   \n",
       "4         5  Father of the Bride Part II (1995)   None  1995   \n",
       "\n",
       "                                        genres  num_ratings  rating_median  \\\n",
       "0  Adventure|Animation|Children|Comedy|Fantasy        63469            4.0   \n",
       "1                   Adventure|Children|Fantasy        25045            3.0   \n",
       "2                               Comedy|Romance        15381            3.0   \n",
       "3                         Comedy|Drama|Romance         2961            3.0   \n",
       "4                                       Comedy        15023            3.0   \n",
       "\n",
       "   rating_mean  relatable  \n",
       "0     3.889300          1  \n",
       "1     3.229527          1  \n",
       "2     3.178142          1  \n",
       "3     2.879433          1  \n",
       "4     3.080410          1  "
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.read_sql_query('SELECT * FROM movie_time_app_movie LIMIT 5', connection)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
